################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2024)

## Description:

# This file makes the word propensity (q), posterior (rho) and divergence (pi)
# for social background, used in Figure A.13
## # Description:
# - Reads in the dtm and use applied distributed multinomial regression (dmr) to 
# estimate - divergence across white collar/blue collar
# Output is the word propensity (q), posterior (rho) and divergence (pi)

# This code is based on the June 2024 version of "Sample Code" 
# provided on the following webpage:
# https://scholar.harvard.edu/shapiro/publications
# Under: 
# 2019 Gentzkow, Matthew, Jesse M. Shapiro, and Matt Taddy. 
# “Measuring group differences in high-dimensional choices: 
# Method and application to congressional speech.” 
# Econometrica 87, no. 4 (2019): 1307-1340.

################################################################################

# Clear memory and work environment

rm(list = ls())

gc()

# Packages

library(data.table)
library(parallel)
library(distrom)

## The wd is set by master.R

data.dir   <-  "../data/2_processed_data"
output.dir <- "../data/3_model_output"


# Read data----

## Meta
speaker_metadata <- fread(paste(data.dir, "speeches_session_lemma.csv", sep = "/"), 
                          drop = "text",
                          encoding="UTF-8")

speaker_metadata <- speaker_metadata[!is.na(occupation)]

## Document term matrix
dtm.counts <- readRDS(paste(data.dir, "document_term_matrix.rds", sep = "/"))

# Dropping speaches of speakers without information on social background
dtm.counts <- dtm.counts[rownames(dtm.counts) %in% speaker_metadata$pid_session,]

# Using the equation notation as in the paper.
## Our indicator category (A) is right-wing politician, and our reference 
## category (B) is left-wing politicians

speaker_metadata[, A := dplyr::if_else(occupation == "white", 1, 0)] 

# keep only the metadata that can be found in dtm 
speaker_metadata <- speaker_metadata[pid_session %in% rownames(dtm.counts)] 


# Construct penalized estimation inputs
# No intercept model

X           <- sparse.model.matrix(
  
  ~ 0 + session + factor(party),
  
  data = speaker_metadata
)

# # Do the QR decomposition
qx <- qr(as.matrix(X))
X <- X[, qx$pivot[1:qx$rank]] #Keep independent columns

# # The rows of X are the speaker_id
rownames(X) <- speaker_metadata$pid_session

# # Keeping only metadata of the speeches in the corpus: 
X <- X[rownames(dtm.counts), ]

# Create dummies for A interacted with parlamentary period
A   <- sparse.model.matrix(~ 0 + session:A, data = speaker_metadata)

rownames(A) <- speaker_metadata$pid_session #naming each row with speech name

A <- A[rownames(dtm.counts), ]

mu <- log(rowSums(dtm.counts)) #mu is the shape parameter of the poisson

# Estimate penalized MLE----

# Detect the number of available CPU cores
n.cores <- detectCores()

# Decide how many cores to leave free (e.g., 1 or 2)
cores_to_leave_free <- 2
n.cores <- max(1, n.cores - cores_to_leave_free) # Ensure at least 1 core is used

cl <- makeCluster(n.cores, type = ifelse(.Platform$OS.type == 'unix', 'FORK', 'PSOCK')) 

fit <- dmr( #distributed multinominal regression: https:\\arxiv.org/abs/1311.6139
  cl = cl, 
  covars = cbind(X, A),
  counts = dtm.counts,
  verb = 1, 
  mu = mu,
  free = 1:ncol(X),
  fixedcost = 1e-5,
  lambda.start = Inf,
  lambda.min.ratio = 1e-5,
  nlambda = 100,
  standardize = F
)
stopCluster(cl)


############# Coefs 1 (from penalised MLE) #######################

coefs   <- coef(fit, k = log(nrow(X)), corrected = F)

coefs_X <- coefs[colnames(X), ]
coefs_A <- coefs[colnames(A), ]

# Phrase-time-specific utility loadings for each phrase if spoken by speakers
# of category A.

I           <- sparse.model.matrix(~ 0 + session, data = speaker_metadata)
rownames(I) <- speaker_metadata$pid_session
I           <- I[rownames(dtm.counts), ]
phi         <- I %*% coefs_A

# Utility

# Compute utility of speech if all speakers have affiliation B
utility_B        <- cbind(1, X) %*% rbind(coefs['intercept', ], coefs_X)
# Compute utility of speech if all speakers have affiliation A 
utility_A        <- utility_B + phi

# Compute utility of speech for observed speakers and 
# speaker "clones" with the same speech and covariates but the opposite 
# affiliation. 
affiliation_matrix       <- replicate(ncol(dtm.counts), rowSums(A))
affiliation_matrix_clone <- (1 - affiliation_matrix)
utility_real       <- utility_B + affiliation_matrix * phi
utility_clone      <- utility_B + affiliation_matrix_clone * phi
utility            <- rbind(utility_real, utility_clone)
q                  <- exp(utility) / rowSums(exp(utility))

# Compute expected posterior that speaker and clone is A based on speech.
# Compute rho through the likelihood ratio, not from the q's.
party_ratio        <- rowSums(exp(utility_B)) / rowSums(exp(utility_A))
likelihood_ratio   <- party_ratio * exp(phi)
rho                <- likelihood_ratio / (1 + likelihood_ratio)
rho                <- rbind(rho, rho)

expected_posterior <- rowSums(rho * q)

# Indicate A/B affiliation for speakers and clones

affiliation_A <- rowSums(A) > 0
affiliation_A <- as.matrix(affiliation_A)
affiliation_A <- rbind(affiliation_A, 1 - affiliation_A)
affiliation   <- ifelse(affiliation_A, 'A', 'B')

# Indicate session of congress for speakers and clones
session        <- speaker_metadata$session
names(session) <- speaker_metadata$pid_session
session        <- session[rownames(dtm.counts)]
session        <- as.matrix(session)
session        <- rbind(session, session)

# Compute divergence
party_pi <- tapply(expected_posterior, list(affiliation, session), mean) 
pi       <- .5 * (party_pi['A', ] + (1 - party_pi['B', ]))
pi       <- data.frame(session = names(pi), pi = pi)


# Save results

write.csv(pi, file = paste(output.dir ,'occ_party.csv', sep = "/"), row.names = F, quote = F)










